# setwd("/Users/seantrott/Dropbox/UCSD/Research/NLMs/vlm-vit-num/analysis")
# Read every results CSV found in the results directory into a list of
# data frames (one element per file).
directory_path <- "../results"
csv_files <- list.files(
  path = directory_path,
  # `pattern` is a regular expression, not a shell glob: match the literal
  # ".csv" extension at the end of the file name.
  pattern = "\\.csv$",
  full.names = TRUE
)
csv_list <- csv_files %>%
  map(~ read_csv(.))
## New names:
## Rows: 8320 Columns: 13
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): model_name, image_type, image_1, image_2, numerosity_comparison_type dbl
## (8): ...1, cosine_similarity, numerosity_1, numerosity_2, area_diff, lay...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 24320 Columns: 13
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): model_name, image_type, image_1, image_2, numerosity_comparison_type dbl
## (8): ...1, cosine_similarity, numerosity_1, numerosity_2, area_diff, lay...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 32640 Columns: 13
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): model_name, image_type, image_1, image_2, numerosity_comparison_type dbl
## (8): ...1, cosine_similarity, numerosity_1, numerosity_2, area_diff, lay...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Stack the per-file tables and add helper columns used by later analyses.
df_hf_models <- bind_rows(csv_list) %>%
  # Absolute difference between numerosity_1 and numerosity_2.
  mutate(numerosity_diff = abs(numerosity_2 - numerosity_1)) %>%
  # Largest layer index observed for each model, repeated on all of its rows.
  group_by(model_name) %>%
  mutate(max_layer = max(layer)) %>%
  # Drop the grouping so later verbs do not silently operate per model.
  ungroup()

# Row count per model.
table(df_hf_models$model_name)
##
## clip-vit-base-patch32 clip-vit-large-patch14
## 24960 32000
## vit-base-patch16-224-in21k
## 8320
# Bar chart: how many rows fall at each absolute numerosity difference.
ggplot(df_hf_models, aes(x = numerosity_diff)) +
  geom_bar(stat = "count", alpha = .6) +
  labs(x = "Difference in Numerosity") +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    text = element_text(size = 15)
  )
# Bar chart: how many rows fall at each distinct area_diff value.
ggplot(df_hf_models, aes(x = area_diff)) +
  geom_bar(stat = "count", alpha = .6) +
  labs(x = "Difference in Surface Area") +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    text = element_text(size = 15)
  )
# Histogram of cosine similarity, one panel per model.
# No bin count is given, so ggplot2 falls back to its default binning.
ggplot(df_hf_models, aes(x = cosine_similarity)) +
  geom_histogram(alpha = .6) +
  facet_wrap(~model_name) +
  labs(x = "Cosine Similarity") +
  theme_minimal() +
  theme(text = element_text(size = 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Mean cosine similarity and its standard error for every combination of
# model, numerosity comparison type, and layer.
df_summary <- df_hf_models %>%
  group_by(model_name, numerosity_comparison_type, layer) %>%
  summarise(
    avg_similarity = mean(cosine_similarity, na.rm = TRUE),
    # Divide by the number of non-missing values so the denominator matches
    # the observations that sd(..., na.rm = TRUE) actually used.
    se_similarity = sd(cosine_similarity, na.rm = TRUE) /
      sqrt(sum(!is.na(cosine_similarity))),
    # Return an ungrouped table; nothing downstream relies on the grouping.
    .groups = "drop"
  )
## `summarise()` has grouped output by 'model_name', 'numerosity_comparison_type'.
## You can override using the `.groups` argument.
# Horizontal bar chart of mean cosine similarity at each model's final layer,
# split by numerosity comparison type. Error bars span +/- 1 standard error.
df_summary %>%
  # Select each model's own deepest layer. A fixed `layer == 12` is only the
  # final layer for models whose largest layer index happens to be 12.
  group_by(model_name) %>%
  filter(layer == max(layer)) %>%
  ungroup() %>%
  ggplot(aes(
    x = model_name,
    y = avg_similarity,
    fill = numerosity_comparison_type
  )) +
  geom_col(
    position = position_dodge(width = 0.5),
    width = .6
  ) +
  geom_errorbar(
    aes(
      ymin = avg_similarity - se_similarity,
      ymax = avg_similarity + se_similarity
    ),
    width = 0.2,
    position = position_dodge(width = 0.5)
  ) +
  labs(
    x = "Model",
    y = "Average Cosine Similarity",
    fill = ""
  ) +
  theme_minimal() +
  coord_flip() +
  scale_fill_viridis(option = "mako", discrete = TRUE) +
  theme(
    axis.title = element_text(size = rel(1.2)),
    axis.text = element_text(size = rel(1.2)),
    legend.text = element_text(size = rel(1.2)),
    strip.text.x = element_text(size = rel(1.2)),
    legend.position = "bottom"
  )
# Mixed-effects model over all layers: cosine similarity as a function of
# area difference and numerosity comparison type, each interacting with layer.
# Random intercepts for both images; random intercept and comparison-type
# slope for model.
# NOTE(review): the recorded run below reports a convergence warning for this
# specification -- confirm the fit is acceptable before interpreting it.
m1 <- lmer(
  cosine_similarity ~ area_diff * layer + numerosity_comparison_type * layer +
    (1 | image_1) + (1 | image_2) +
    (1 + numerosity_comparison_type | model_name),
  data = df_hf_models
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0475872 (tol = 0.002, component 1)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: cosine_similarity ~ area_diff * layer + numerosity_comparison_type *
## layer + (1 | image_1) + (1 | image_2) + (1 + numerosity_comparison_type |
## model_name)
## Data: df_hf_models
##
## REML criterion at convergence: -302588.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -15.4489 -0.3298 0.1555 0.5089 3.0384
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## image_1 (Intercept) 1.188e-04 0.010900
## image_2 (Intercept) 8.973e-05 0.009473
## model_name (Intercept) 2.946e-04 0.017163
## numerosity_comparison_typesame 7.174e-05 0.008470 -1.00
## Residual 5.480e-04 0.023409
## Number of obs: 65280, groups: image_1, 480; image_2, 452; model_name, 3
##
## Fixed effects:
## Estimate Std. Error df t value
## (Intercept) 1.010e+00 9.968e-03 2.047e+00 101.318
## area_diff 1.465e-05 8.249e-06 7.264e+02 1.775
## layer -3.876e-03 3.016e-05 3.403e+04 -128.514
## numerosity_comparison_typesame -4.372e-03 4.947e-03 2.083e+00 -0.884
## area_diff:layer -1.371e-05 2.361e-07 6.459e+04 -58.076
## layer:numerosity_comparison_typesame 1.985e-03 3.140e-05 1.070e+03 63.216
## Pr(>|t|)
## (Intercept) 8.09e-05 ***
## area_diff 0.0762 .
## layer < 2e-16 ***
## numerosity_comparison_typesame 0.4668
## area_diff:layer < 2e-16 ***
## layer:numerosity_comparison_typesame < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ar_dff layer nmrs__ ar_df:
## area_diff -0.071
## layer -0.026 0.170
## nmrsty_cmp_ -0.987 -0.004 0.027
## are_dff:lyr 0.018 -0.256 -0.663 0.001
## lyr:nmrst__ 0.013 0.005 -0.516 -0.052 -0.018
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.0475872 (tol = 0.002, component 1)
# Scatter of cosine similarity against numerosity difference, restricted to
# each model's final layer, with a linear fit; one panel per model.
df_hf_models %>%
  filter(layer == max_layer) %>%
  ggplot(aes(x = numerosity_diff, y = cosine_similarity)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  facet_wrap(~model_name) +
  labs(
    x = "Numerosity Difference",
    y = "Cosine Similarity",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    text = element_text(size = 15)
  )
## `geom_smooth()` using formula = 'y ~ x'
### Run lm by layer
# Pull one statistic (`column`, e.g. "estimate" or "std.error") for a single
# coefficient (`term_name`) out of a broom::tidy() coefficient table.
extract_term_stat <- function(tidy_fit, term_name, column) {
  tidy_fit[[column]][tidy_fit$term == term_name]
}

# For every (layer, model) pair, regress cosine similarity on numerosity
# difference and area difference. The result has one row per pair with both
# slopes, their standard errors, and the regression's R-squared.
results <- df_hf_models %>%
  # mutate(numerosity_diff = scale(numerosity_diff),
  #        cosine_similarity = scale(cosine_similarity)) %>%
  dplyr::group_by(layer, model_name) %>%
  dplyr::summarise(
    # Fit the model once per group and reuse it for both the coefficient
    # table and R-squared. pick() replaces the deprecated cur_data().
    fit = list(
      lm(
        cosine_similarity ~ numerosity_diff + area_diff,
        data = dplyr::pick(dplyr::everything())
      )
    ),
    .groups = "drop"
  ) %>%
  dplyr::mutate(
    model_summary = purrr::map(fit, broom::tidy),
    r_squared = purrr::map_dbl(fit, ~ summary(.x)$r.squared),
    numerosity_diff_coef = purrr::map_dbl(
      model_summary, extract_term_stat, "numerosity_diff", "estimate"
    ),
    numerosity_diff_se = purrr::map_dbl(
      model_summary, extract_term_stat, "numerosity_diff", "std.error"
    ),
    area_diff_coef = purrr::map_dbl(
      model_summary, extract_term_stat, "area_diff", "estimate"
    ),
    area_diff_se = purrr::map_dbl(
      model_summary, extract_term_stat, "area_diff", "std.error"
    )
  ) %>%
  dplyr::select(
    layer, model_name,
    numerosity_diff_coef, numerosity_diff_se,
    area_diff_coef, area_diff_se,
    r_squared
  )
## Warning: There were 3 warnings in `dplyr::summarise()`.
## The first warning was:
## ℹ In argument: `model_summary = list(...)`.
## ℹ In group 1: `layer = 0`, `model_name = "clip-vit-base-patch32"`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## `summarise()` has grouped output by 'layer'. You can override using the
## `.groups` argument.
# Numerosity slope from the per-layer regressions plotted against layer, one
# series per model. The shaded band is +/- 1 standard error of the slope.
results %>%
  # Map both colour (lines) and fill (ribbons) to the model so that the
  # viridis scales below apply to each layer of the plot.
  ggplot(aes(
    x = layer,
    y = numerosity_diff_coef,
    color = model_name,
    fill = model_name
  )) +
  geom_line(linetype = "dotted") + # Coefficient estimate per layer
  geom_ribbon(
    aes(
      ymin = numerosity_diff_coef - numerosity_diff_se,
      ymax = numerosity_diff_coef + numerosity_diff_se
    ),
    alpha = 0.5,
    color = NA # No outline on the SE band
  ) +
  labs(
    title = "",
    x = "Layer",
    y = "Coefficient (Numerosity)",
    fill = "",
    color = ""
  ) +
  theme_minimal() +
  scale_x_continuous(
    limits = c(0, max(results$layer)),
    breaks = seq(0, max(results$layer), 2)
  ) +
  theme(
    text = element_text(size = 15),
    legend.position = "bottom"
  ) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  scale_fill_viridis(option = "mako", discrete = TRUE)
# Area-difference slope from the per-layer regressions plotted against layer,
# one series per model. The shaded band is +/- 1 standard error of the slope.
results %>%
  # Map both colour (lines) and fill (ribbons) to the model so that the
  # viridis scales below apply to each layer of the plot.
  ggplot(aes(
    x = layer,
    y = area_diff_coef,
    color = model_name,
    fill = model_name
  )) +
  geom_line(linetype = "dotted") + # Coefficient estimate per layer
  geom_ribbon(
    aes(
      ymin = area_diff_coef - area_diff_se,
      ymax = area_diff_coef + area_diff_se
    ),
    alpha = 0.5,
    color = NA # No outline on the SE band
  ) +
  labs(
    title = "",
    x = "Layer",
    y = "Coefficient (Area)",
    fill = "",
    color = ""
  ) +
  theme_minimal() +
  scale_x_continuous(
    limits = c(0, max(results$layer)),
    breaks = seq(0, max(results$layer), 2)
  ) +
  theme(
    text = element_text(size = 15),
    legend.position = "bottom"
  ) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  scale_fill_viridis(option = "mako", discrete = TRUE)
# Mixed-effects model over all layers: cosine similarity as a function of
# area difference and numerosity difference, each interacting with layer.
# Random intercepts for both images and for model.
# NOTE(review): the recorded run below reports a (marginal) convergence
# warning for this specification -- confirm the fit before interpreting it.
m1 <- lmer(
  cosine_similarity ~ area_diff * layer + numerosity_diff * layer +
    (1 | image_1) + (1 | image_2) + (1 | model_name),
  data = df_hf_models
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0020423 (tol = 0.002, component 1)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: cosine_similarity ~ area_diff * layer + numerosity_diff * layer +
## (1 | image_1) + (1 | image_2) + (1 | model_name)
## Data: df_hf_models
##
## REML criterion at convergence: -307971.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -16.2860 -0.3511 0.1401 0.5089 2.8484
##
## Random effects:
## Groups Name Variance Std.Dev.
## image_1 (Intercept) 7.265e-05 0.008523
## image_2 (Intercept) 5.405e-05 0.007352
## model_name (Intercept) 1.676e-04 0.012947
## Residual 5.065e-04 0.022506
## Number of obs: 65280, groups: image_1, 480; image_2, 452; model_name, 3
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.005e+00 7.521e-03 2.046e+00 133.578 4.62e-05 ***
## area_diff 1.550e-05 6.645e-06 7.819e+02 2.332 0.0199 *
## layer -1.869e-03 2.687e-05 6.459e+04 -69.543 < 2e-16 ***
## numerosity_diff 5.195e-04 4.442e-05 8.944e+02 11.695 < 2e-16 ***
## area_diff:layer -1.380e-05 2.270e-07 6.458e+04 -60.809 < 2e-16 ***
## layer:numerosity_diff -1.832e-04 1.800e-06 6.458e+04 -101.773 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ar_dff layer nmrst_ ar_df:
## area_diff -0.077
## layer -0.031 0.223
## numrsty_dff -0.031 0.018 0.138
## are_dff:lyr 0.023 -0.305 -0.731 -0.006
## lyr:nmrsty_ 0.012 -0.005 -0.380 -0.362 0.017
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.0020423 (tol = 0.002, component 1)
### Layer depth ratio
# Numerosity slope plotted against relative layer depth (layer / deepest
# layer of the same model), binned into tenths, one series per model.
results %>%
  group_by(model_name) %>%
  mutate(
    max_layer = max(layer),
    prop_layer = layer / max_layer
  ) %>%
  # ntile() splits each model's layers into 10 roughly equal-sized buckets.
  mutate(binned_prop_layer = ntile(prop_layer, 10)) %>%
  mutate(prop_binned = binned_prop_layer / 10) %>%
  # NOTE(review): a model with more than 10 layers contributes several rows
  # to the same bin, so this plot draws more than one value per x position.
  # Consider averaging per bin with stat_summary() if that is unintended.
  ggplot(aes(
    x = prop_binned,
    y = numerosity_diff_coef,
    color = model_name,
    fill = model_name
  )) +
  geom_line(linetype = "dotted") + # Coefficient estimate per depth bin
  geom_ribbon(
    aes(
      ymin = numerosity_diff_coef - numerosity_diff_se,
      ymax = numerosity_diff_coef + numerosity_diff_se
    ),
    alpha = 0.5,
    color = NA # No outline on the SE band
  ) +
  labs(
    title = "",
    x = "Layer Depth",
    y = "Coefficient (Numerosity)",
    fill = "",
    color = ""
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 15),
    legend.position = "bottom"
  ) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  scale_fill_viridis(option = "mako", discrete = TRUE)
# Numerosity slope against binned relative layer depth. Within each depth bin
# the per-layer slopes of a model are averaged (line) and shown with the
# mean +/- SE across those layers (ribbon).
results %>%
  group_by(model_name) %>%
  mutate(
    max_layer = max(layer),
    prop_layer = layer / max_layer
  ) %>%
  # ntile() splits each model's layers into 10 roughly equal-sized buckets.
  mutate(binned_prop_layer = ntile(prop_layer, 10)) %>%
  mutate(prop_binned = binned_prop_layer / 10) %>%
  ggplot(aes(x = prop_binned, y = numerosity_diff_coef)) +
  stat_summary(
    aes(group = model_name, color = model_name),
    fun = mean,
    geom = "line",
    # `linewidth` replaces `size` for lines (deprecated in ggplot2 3.4.0).
    linewidth = 2
  ) +
  stat_summary(
    aes(group = model_name, fill = model_name),
    fun.data = mean_se,
    geom = "ribbon",
    alpha = 0.2,
    color = NA
  ) +
  theme_minimal() +
  labs(
    title = "",
    x = "Layer Depth",
    y = "Coefficient (Numerosity)",
    fill = "",
    color = ""
  ) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  # Give the ribbons the same palette as the lines they belong to.
  scale_fill_viridis(option = "mako", discrete = TRUE) +
  theme(
    text = element_text(size = 15),
    legend.position = "bottom"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Label each model by family: names containing "clip" are tagged "VLM",
# everything else "ViT". Also (re)compute each model's deepest layer index.
df_hf_models <- df_hf_models %>%
  mutate(model_type = case_when(
    str_detect(model_name, "clip") ~ "VLM",
    TRUE ~ "ViT"
  )) %>%
  group_by(model_name) %>%
  mutate(max_layer = max(layer)) %>%
  # Drop the grouping so later verbs do not silently operate per model.
  ungroup()
# Mean cosine similarity and its standard error at each model's final layer,
# aggregated by model type and numerosity comparison type.
df_summary <- df_hf_models %>%
  filter(layer == max_layer) %>%
  group_by(model_type, numerosity_comparison_type) %>%
  summarise(
    avg_similarity = mean(cosine_similarity, na.rm = TRUE),
    # Divide by the number of non-missing values so the denominator matches
    # the observations that sd(..., na.rm = TRUE) actually used.
    se_similarity = sd(cosine_similarity, na.rm = TRUE) /
      sqrt(sum(!is.na(cosine_similarity))),
    # Return an ungrouped table; nothing downstream relies on the grouping.
    .groups = "drop"
  )
## `summarise()` has grouped output by 'model_type'. You can override using the
## `.groups` argument.
# Point-range plot of mean cosine similarity by model type, coloured by
# numerosity comparison type. Error bars span +/- 2 standard errors.
dodge_by_type <- position_dodge(width = 0.5)
ggplot(
  df_summary,
  aes(
    x = factor(model_type),
    y = avg_similarity,
    color = numerosity_comparison_type
  )
) +
  geom_point(position = dodge_by_type, size = 2) +
  geom_errorbar(
    aes(
      ymin = avg_similarity - 2 * se_similarity,
      ymax = avg_similarity + 2 * se_similarity
    ),
    width = 0.2,
    position = dodge_by_type
  ) +
  labs(
    x = "Model Type",
    y = "Average Cosine Similarity",
    color = ""
  ) +
  theme_minimal() +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(
    axis.title = element_text(size = rel(1.2)),
    axis.text = element_text(size = rel(1.2)),
    legend.text = element_text(size = rel(1.2)),
    strip.text.x = element_text(size = rel(1.2)),
    legend.position = "bottom"
  )
# Final-layer mixed-effects model: does the effect of area difference and of
# numerosity comparison type on cosine similarity differ by model type?
# Random intercepts for both images and for model.
m1 <- lmer(
  cosine_similarity ~ area_diff * model_type +
    numerosity_comparison_type * model_type +
    (1 | image_1) + (1 | image_2) + (1 | model_name),
  data = filter(df_hf_models, layer == max_layer)
)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## cosine_similarity ~ area_diff * model_type + numerosity_comparison_type *
## model_type + (1 | image_1) + (1 | image_2) + (1 | model_name)
## Data: filter(df_hf_models, layer == max_layer)
##
## REML criterion at convergence: -13704.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.0206 -0.3721 0.0161 0.4847 3.9337
##
## Random effects:
## Groups Name Variance Std.Dev.
## image_1 (Intercept) 0.0013206 0.03634
## image_2 (Intercept) 0.0009146 0.03024
## model_name (Intercept) 0.0006441 0.02538
## Residual 0.0010780 0.03283
## Number of obs: 3840, groups: image_1, 480; image_2, 452; model_name, 3
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) 9.167e-01 2.574e-02 1.056e+00
## area_diff -9.625e-04 3.410e-05 1.394e+03
## model_typeVLM 3.622e-03 3.121e-02 1.015e+00
## numerosity_comparison_typesame 6.497e-02 3.458e-03 2.203e+03
## area_diff:model_typeVLM 6.546e-04 2.363e-05 3.121e+03
## model_typeVLM:numerosity_comparison_typesame -2.962e-02 2.844e-03 3.121e+03
## t value Pr(>|t|)
## (Intercept) 35.613 0.0148 *
## area_diff -28.220 <2e-16 ***
## model_typeVLM 0.116 0.9263
## numerosity_comparison_typesame 18.787 <2e-16 ***
## area_diff:model_typeVLM 27.698 <2e-16 ***
## model_typeVLM:numerosity_comparison_typesame -10.415 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ar_dff md_VLM nmrs__ a_:_VL
## area_diff -0.113
## modl_typVLM -0.810 0.037
## nmrsty_cmp_ -0.068 -0.026 0.030
## ar_dff:_VLM 0.065 -0.577 -0.064 0.013
## mdl_tVLM:__ 0.045 0.011 -0.044 -0.685 -0.020
# Final-layer scatter of cosine similarity against numerosity difference with
# a linear fit, coloured and faceted by model type.
df_hf_models %>%
  filter(layer == max_layer) %>%
  ggplot(aes(
    x = numerosity_diff,
    y = cosine_similarity,
    color = model_type
  )) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  facet_wrap(~model_type) +
  labs(
    x = "Numerosity Difference",
    y = "Cosine Similarity",
    color = ""
  ) +
  theme_minimal() +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(
    legend.position = "bottom",
    text = element_text(size = 15)
  )
## `geom_smooth()` using formula = 'y ~ x'
# Final-layer mixed-effects model: does the effect of area difference and of
# numerosity difference on cosine similarity differ by model type?
# Random intercepts for both images and for model.
m1 <- lmer(
  cosine_similarity ~ area_diff * model_type + numerosity_diff * model_type +
    (1 | image_1) + (1 | image_2) + (1 | model_name),
  data = filter(df_hf_models, layer == max_layer)
)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: cosine_similarity ~ area_diff * model_type + numerosity_diff *
## model_type + (1 | image_1) + (1 | image_2) + (1 | model_name)
## Data: filter(df_hf_models, layer == max_layer)
##
## REML criterion at convergence: -14074.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7464 -0.3629 0.0264 0.4708 4.1530
##
## Random effects:
## Groups Name Variance Std.Dev.
## image_1 (Intercept) 0.0007275 0.02697
## image_2 (Intercept) 0.0005041 0.02245
## model_name (Intercept) 0.0006442 0.02538
## Residual 0.0010601 0.03256
## Number of obs: 3840, groups: image_1, 480; image_2, 452; model_name, 3
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 9.831e-01 2.562e-02 1.036e+00 38.372 0.0146
## area_diff -9.616e-04 2.934e-05 1.922e+03 -32.773 <2e-16
## model_typeVLM -2.438e-02 3.120e-02 1.013e+00 -0.781 0.5764
## numerosity_diff -6.235e-03 2.145e-04 2.562e+03 -29.068 <2e-16
## area_diff:model_typeVLM 6.548e-04 2.344e-05 3.133e+03 27.940 <2e-16
## model_typeVLM:numerosity_diff 2.395e-03 1.859e-04 3.133e+03 12.884 <2e-16
##
## (Intercept) *
## area_diff ***
## model_typeVLM
## numerosity_diff ***
## area_diff:model_typeVLM ***
## model_typeVLM:numerosity_diff ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ar_dff md_VLM nmrst_ a_:_VL
## area_diff -0.100
## modl_typVLM -0.813 0.043
## numrsty_dff -0.046 0.019 0.024
## ar_dff:_VLM 0.066 -0.666 -0.065 -0.012
## mdl_tyVLM:_ 0.034 -0.011 -0.034 -0.722 0.017